---
title: "Power Analysis"
author: ""
date: "2024-11-09"
output:
  pdf_document: default
  html_document: default
---

## Packages used:

```{r, echo = T, results = 'hide',  error=FALSE, warning=FALSE, message=FALSE}
rm(list = ls())
library(tidyverse)
library(ggplot2)
library(ggrepel)
library(estimatr)
library(dotwhisker)
library(broom)
library(dplyr)
library(sjPlot)
library(hrbrthemes)
library(ivreg)
```

## Simulating data. 

We simulate data using the variable names we will use to code our variables in 
the main analysis. Note we generate simulated data from two studies, and include 
two treatment groups. 


```{r,echo = T, results = 'hide',  error=FALSE, warning=FALSE, message=FALSE}
set.seed(3)
N = 1000 # Observations. 

##### Study 1:
## Generating uncorrelated data:
dat <- tibble(ID = 1:N,
    T1 = rbinom(N,1, .5),
    T2 = rbinom(N,1, .5),
    SI_1 = sample.int(7, N, replace = TRUE),
    SI_2 = sample.int(7, N, replace = TRUE),
    SI_3 = sample.int(7, N, replace = TRUE),
    SI_4 = sample.int(7, N, replace = TRUE),
    SI_5 = sample.int(6, N, replace = TRUE),
    SI_6 = sample.int(7, N, replace = TRUE),
    MN_1 = sample.int(7, N, replace = TRUE),
    MT_1 = sample.int(7, N, replace = TRUE),
    AM_1 = sample.int(7, N, replace = TRUE),
    AM_2 = sample.int(7, N, replace = TRUE),
    AM_3 = sample.int(7, N, replace = TRUE),
    HTE_1 = sample.int(5, N, replace = TRUE),
    HTE_2 = sample.int(3, N, replace = TRUE),
    HTE_3 = sample.int(2, N, replace = TRUE),
    HTE_4 = sample.int(2, N, replace = TRUE),
    HTE_5 = sample.int(7, N, replace = TRUE),
    HTE_6 = sample.int(7, N, replace = TRUE),
    ctrl_1 = sample.int(3, N, replace = TRUE),
    ctrl_2 = sample.int(2, N, replace = TRUE),
    ctrl_3 = sample.int(4, N, replace = TRUE),
    ctrl_4 = sample.int(7, N, replace = TRUE),
    ctrl_5 = sample.int(7, N, replace = TRUE),
    ctrl_6 = sample.int(2, N, replace = TRUE),
    rand1  =  rbinom(N,1, .15), # We'll use these to create weaker treatment effects.
    rand2  =  rbinom(N,1, .45),
    rand3  =  rbinom(N,1, .65),
    attrition =  rbinom(N, 1, .1),  #### WE WILL ASSUME 10% Attrition. 
    study = 1)

## Moderate direct, moderate interaction + ceiling. 
  dat <- dat %>% mutate(SI_1 = ifelse(T1 ==1, SI_1 + rand2, SI_1)) %>%
    mutate(SI_1 = ifelse(T1 ==1 & T2==1, SI_1 + rand2, SI_1)) %>%
    mutate(SI_1 = ifelse(SI_1 > 7, 7, SI_1)) 

## Null direct, strong interaction + ceiling 
  dat <- dat %>% mutate(SI_2 = ifelse(T1 ==1 & T2==1, SI_2 + 1, SI_2)) %>%
    mutate(SI_2 = ifelse(SI_2 > 7, 7, SI_2)) 


## weak direct, strong interaction, moderate confounding T2 
  dat <- dat %>% mutate(SI_3 = ifelse(T1 ==1, SI_3 + rand1, SI_3)) %>%
    mutate(SI_3 = ifelse(T1 ==1 & T2==1, SI_3 + 1, SI_3)) %>%
    mutate(SI_3 = ifelse(SI_3 > 7, 7, SI_3))  %>%
    mutate(SI_3 = ifelse(SI_3 < 1 , 1, SI_3)) 

## Moderate direct, moderate interaction  
  dat <- dat %>% mutate(SI_4 = ifelse(T1 ==1, SI_4 + rand3, SI_4)) %>%
    mutate(SI_4 = ifelse(T1 ==1 & T2==1, SI_4 + rand2, SI_4)) %>%
    mutate(SI_4 = ifelse(SI_4 > 7, 7, SI_4)) 
  

## Small NEGATIVE direct, strong interaction  
  dat <- dat %>% mutate(SI_5 = ifelse(T1 ==0, SI_5 + rand1, SI_5)) %>%
    mutate(SI_5 = ifelse(T1 ==1 & T2==1, SI_5 + 1, SI_5)) %>%
    mutate(SI_5 = ifelse(SI_5 > 7, 7, SI_5))
  
## Moderate direct, moderate interaction  
  dat <- dat %>% mutate(SI_6 = ifelse(T1 ==1, SI_6 + rand3, SI_6)) %>%
    mutate(SI_6 = ifelse(T1 ==1 & T2==1, SI_6 + rand2, SI_6)) %>%
    mutate(SI_6 = ifelse(SI_6 > 7, 7, SI_6)) 
  
#############################################
#### NOTE: WE ASSUME WEAKER EFFECTS 
###### The random variables are less likely to be 1s ####  
########### We also swap out some of the effects so the outcomes are differently 
########### treated across different studies. 
###########################################


####### Study 2 & Stacking the data:
## Many of our tests consider the net impact of our two treatments `T1, T2` upon 
## all five measures of soldier-level effectiveness `SI_1`-`SI_5`.

datstack <- tibble(ID = (N+1):(2*N),
               T1 = rbinom(N,1, .5),
               T2 = rbinom(N,1, .5),
               SI_1 = sample.int(7, N, replace = TRUE),
               SI_2 = sample.int(7, N, replace = TRUE),
               SI_3 = sample.int(7, N, replace = TRUE),
               SI_4 = sample.int(7, N, replace = TRUE),
               SI_5 = sample.int(6, N, replace = TRUE),
               SI_6 = sample.int(7, N, replace = TRUE),
               MN_1 = sample.int(7, N, replace = TRUE),
               MT_1 = sample.int(7, N, replace = TRUE),
               AM_1 = sample.int(7, N, replace = TRUE),
               AM_2 = sample.int(7, N, replace = TRUE),
               AM_3 = sample.int(7, N, replace = TRUE),
               rand1  =  rbinom(N,1, .2), # Weaker effects.
               rand2  =  rbinom(N,1, .35),
               rand3  =  rbinom(N,1, .4),
               attrition =  rbinom(N, 1, .1),
               study = 2) %>%  mutate(SI_1 = ifelse(T1 ==1, SI_1 + rand2, SI_1)) %>%
  mutate(SI_1 = ifelse(T1 ==1 & T2==1, SI_1 + rand2, SI_1)) %>%
  mutate(SI_1 = ifelse(SI_1 > 7, 7, SI_1)) %>%
  mutate(SI_2 = ifelse(T1 ==1 & T2==1, SI_2 + rand3, SI_2)) %>%
  mutate(SI_2 = ifelse(SI_2 > 7, 7, SI_2))  %>% 
  mutate(SI_3 = ifelse(T1 ==0, SI_3 + rand1, SI_3)) %>%
  mutate(SI_3 = ifelse(T1 ==1 & T2==1, SI_3 + rand2, SI_3)) %>%
  mutate(SI_3 = ifelse(SI_3 > 7, 7, SI_3)) %>% 
  mutate(SI_4 = ifelse(T1 ==1, SI_4 + rand1, SI_4)) %>%
  mutate(SI_4 = ifelse(T2==1, SI_4 - rand2, SI_4)) %>%
  mutate(SI_4 = ifelse(SI_4 > 7, 7, SI_4))  %>%
  mutate(SI_4 = ifelse(SI_4 < 1 , 1, SI_4))  %>% 
  mutate(SI_5 = ifelse(T1 ==1, SI_5 + rand3, SI_5)) %>%
  mutate(SI_5 = ifelse(T1 ==1 & T2==1, SI_5 + rand3, SI_5)) %>%
  mutate(SI_5 = ifelse(SI_5 > 7, 7, SI_5)) %>%
  mutate(SI_6 = ifelse(T1 ==1, SI_6 + rand1, SI_6)) %>%
  mutate(SI_6 = ifelse(T2==1, SI_6 - rand2, SI_6)) %>%
  mutate(SI_6 = ifelse(SI_6 > 7, 7, SI_6))  %>%
  mutate(SI_6 = ifelse(SI_6 < 1 , 1, SI_6))  %>% 
  bind_rows(dat)  %>%
  mutate(T1 = ifelse(attrition ==1, NA, T1)) %>%  
  mutate(study = as.factor(study)) %>%  
  select(ID, T1:T2, SI_1:SI_6, MT_1, MN_1, AM_1:AM_3, HTE_1:HTE_6, ctrl_1:ctrl_6, study) %>%
  group_by(ID) %>% 
  pivot_longer(cols = SI_1:SI_6, 
               names_to = "Variable", 
               values_to = "Value",
               values_drop_na = F) 

```

Here we present results of power analysis for H1 and H2 tests with global models. We conduct the power analysis in the following manner: (1) We randomly draw a sample from the data simulated in the same manner adopted in Appendix B. We vary  the sample size from 50 to 1,000. (2) We run a stacked regression model with the randomly sampled data to calculate statistical power. (3) We repeat this 500 times. (4) We then average the 500 values of statistical power for each sample size. (5) We plot a graph that shows how power changes as sample size increases. The size of the sample we aim to recruit (n = 700) yields statistical power higher than the conventional standard of .8 (dashed line in graphs). The statistical significance level is set at .95. Effect size used for power analysis is set at .2, in order to represent moderate effect size and also reflect the simulation results presented in Appendix B.

```{r}
## H1

pwr.function1 <- function(effect, sample_size) {

  sig_H1 <- c()

  for (i in 1:500) {
  pop <- data.frame(1:2000)
  samp <- sample_n(pop, sample_size)
  dat_sample <- filter(datstack, ID %in% samp$X1.2000)

  H1_pwr <- lm_robust(Value ~ T1 + Variable + study,  
                clusters = ID,
                data = dat_sample)
    
  sig_H1[i] <- tidy(H1_pwr)$p.value[2] <= .05

  }

  sig_H1 %>%
    mean(na.rm = TRUE) %>%
    return()

}

pwr_lv1 <- c()

samp_size <- c(50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 
               700, 750, 800, 850, 900, 950, 1000)

for (i in 1:20) {
  pwr_lv1[i] <- pwr.function1(.2, samp_size[i])
}

pwr1 <- tibble(sample = samp_size,
              power = pwr_lv1)



## H2

pwr.function2 <- function(effect, sample_size) {

  sig_H2 <- c()
    
  for (i in 1:500) {
  pop <- data.frame(1:2000)
  samp <- sample_n(pop, sample_size)
  dat_sample <- filter(datstack, ID %in% samp$X1.2000)

  H2_pwr <-  lm_robust(Value ~ T1*T2 + Variable + study,  
                clusters = ID,
                data = dat_sample)
    
  sig_H2[i] <- tidy(H2_pwr)$p.value[9] <= .05
  
  }

  sig_H2 %>%
    mean(na.rm = TRUE) %>%
    return()

}

pwr_lv2 <- c()

for (i in 1:20) {
  pwr_lv2[i] <- pwr.function2(.2, samp_size[i])
}

pwr2 <- tibble(sample = samp_size,
              power = pwr_lv2)



## Power Analysis Plots

ggplot(pwr1, 
       aes(x = sample, y = power)) + geom_line() +
       geom_hline(aes(yintercept = .8), linetype = "dashed") + 
       theme_classic() + 
       scale_y_continuous(labels = scales::percent) + 
       labs(x = "Sample Size", y = "Power")

ggplot(pwr2, 
       aes(x = sample, y = power)) + geom_line() +
       geom_hline(aes(yintercept = .8), linetype = "dashed") + 
       theme_classic() + 
       scale_y_continuous(labels = scales::percent) + 
       labs(x = "Sample Size", y = "Power")
```